/*
 * Licensed to Laurent Broudoux (the "Author") under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Author licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.github.lbroudoux.elasticsearch.river.s3.connector;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import com.amazonaws.auth.InstanceProfileCredentialsProvider;
import com.amazonaws.services.s3.model.*;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;

import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.github.lbroudoux.elasticsearch.river.s3.river.S3RiverFeedDefinition;

/**
 * This is a connector for querying and retrieving files or folders from
 * an Amazon S3 bucket. {@link #connectUserBucket(String, String)} must be
 * called before any of the query or download methods, since it is the one
 * creating the underlying S3 client.
 * @author laurent
 */
public class S3Connector{

    private static final ESLogger logger = Loggers.getLogger(S3Connector.class);

    /** Generic S3 host name that gets substituted when a download host is configured. */
    private static final String S3_GENERIC_HOST = "s3.amazonaws.com";

    private final String accessKey;
    private final String secretKey;
    private final boolean useIAMRoleForEC2;
    private String bucketName;
    private String pathPrefix;
    private AmazonS3Client s3Client;

    /**
     * Create a S3Connector without explicit security credentials. This is helpful if you want
     * to use IAM Roles as described here
     * http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html.
     * @param useIAMRoleForEC2 When true, the instance profile credentials provider is forced
     * at connection time; when false, the default client credentials retrieval is used.
     */
    public S3Connector(boolean useIAMRoleForEC2) {
        this.accessKey = null;
        this.secretKey = null;
        this.useIAMRoleForEC2 = useIAMRoleForEC2;
    }

    /**
     * Create a S3Connector with provided security credentials.
     * @param accessKey The AWS access key such as provided by AWS console
     * @param secretKey The AWS secret key such as provided by AWS console
     */
    public S3Connector(String accessKey, String secretKey){
        this.accessKey = accessKey;
        this.secretKey = secretKey;
        this.useIAMRoleForEC2 = false;
    }

    /**
     * Connect to the specified bucket. Explicit access and secret keys are used when both
     * were given; otherwise the IAM role provider is used if it was requested, and the
     * default client credentials retrieval is used as a last resort.
     * @param bucketName Name of the bucket to connect to
     * @param pathPrefix Prefix that will be later used for filtering documents
     * @throws AmazonS3Exception when access or secret keys are wrong or bucket does not exists
     */
    public void connectUserBucket(String bucketName, String pathPrefix) throws AmazonS3Exception{
        this.bucketName = bucketName;
        this.pathPrefix = pathPrefix;
        if (accessKey != null && secretKey != null) {
            AWSCredentials credentials = new BasicAWSCredentials(accessKey, secretKey);
            s3Client = new AmazonS3Client(credentials);
        } else if (useIAMRoleForEC2) {
            // Force usage of IAM Role process as described into
            // http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html.
            s3Client = new AmazonS3Client(new InstanceProfileCredentialsProvider());
        } else {
            // Default credentials retrieval or IAM Role process as described into
            // http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html.
            s3Client = new AmazonS3Client();
        }
        // Getting location seems odd as we don't use it later and doesBucketExists() seems
        // more appropriate... However, this later returns true even for non existing buckets !
        s3Client.getBucketLocation(bucketName);
    }

    /**
     * Select and retrieves summaries of object into bucket and of given path prefix
     * that have modification date younger than lastScanTime.
     * @param lastScanTime Last modification date filter (epoch milliseconds); null means
     * that every object is picked.
     * @return Summaries of picked objects, together with the keys of all the listed objects
     * and the scan time captured before listing started.
     */
    public S3ObjectSummaries getObjectSummaries(Long lastScanTime){
        if (logger.isDebugEnabled()){
            logger.debug("Getting buckets changes since {}", lastScanTime);
        }
        List<String> keys = new ArrayList<String>();
        List<S3ObjectSummary> result = new ArrayList<S3ObjectSummary>();

        // Store the scan time to return before doing big queries...
        Long lastScanTimeToReturn = System.currentTimeMillis();
        // Work on a primitive so the comparison in the loop does not unbox on every item.
        long since = (lastScanTime == null) ? 0L : lastScanTime.longValue();

        ListObjectsRequest request = new ListObjectsRequest().withBucketName(bucketName)
                .withPrefix(pathPrefix);
        ObjectListing listing = s3Client.listObjects(request);
        logger.debug("Listing: {}", listing);
        while (!listing.getObjectSummaries().isEmpty() || listing.isTruncated()){
            List<S3ObjectSummary> summaries = listing.getObjectSummaries();
            if (logger.isDebugEnabled()){
                logger.debug("Found {} items in this listObjects page", summaries.size());
            }
            for (S3ObjectSummary summary : summaries){
                if (logger.isDebugEnabled()){
                    logger.debug("Getting {} last modified on {}", summary.getKey(), summary.getLastModified());
                }
                // Every key is tracked, even when the object itself is not picked.
                keys.add(summary.getKey());
                if (summary.getLastModified().getTime() > since){
                    if (logger.isDebugEnabled()){
                        logger.debug("  Picked !");
                    }
                    result.add(summary);
                }
            }
            listing = s3Client.listNextBatchOfObjects(listing);
        }

        // Wrap results and latest scan time.
        return new S3ObjectSummaries(lastScanTimeToReturn, result, keys);
    }

    /**
     * Retrieve the user metadata attached to an object of the connected bucket.
     * @param key The key of the S3 object within the bucket
     * @return A read-only view of the object user metadata.
     */
    public Map<String,Object> getS3UserMetadata(String key){
        return Collections.<String, Object>unmodifiableMap(s3Client.getObjectMetadata(bucketName, key).getUserMetadata());
    }

    /**
     * Download Amazon S3 file as byte array.
     * @param summary The summary of the S3 Object to download
     * @return This file bytes or null if an I/O error occurs while reading the content.
     */
    public byte[] getContent(S3ObjectSummary summary){
        if (logger.isDebugEnabled()){
            logger.debug("Downloading file content from {}", summary.getKey());
        }
        // Retrieve object corresponding to key into bucket.
        S3Object object = s3Client.getObject(bucketName, summary.getKey());

        InputStream is = null;
        try{
            // Get input stream on S3 Object.
            is = object.getObjectContent();
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            byte[] buffer = new byte[4096];
            int len;
            // -1 is the end-of-stream marker of InputStream.read().
            while ((len = is.read(buffer)) != -1) {
                bos.write(buffer, 0, len);
            }
            return bos.toByteArray();
        } catch (IOException e) {
            logger.error("Unable to download content of {}", e, summary.getKey());
            return null;
        } finally {
            // The stream may never have been assigned if getObjectContent() failed:
            // guard the close so that the original failure is not masked.
            if (is != null){
                try{
                    is.close();
                } catch (IOException e) {
                    logger.warn("Unable to close content stream of {}", e, summary.getKey());
                }
            }
        }
    }

    /**
     * Get the download url of this S3 object. May return null if the
     * object bucket and key cannot be converted to a URL.
     * @param summary A S3 object
     * @param feedDefinition The holder of S3 feed definition.
     * @return The resource url if possible (access is subject to AWS credential). When a
     * download host is defined and the resource url uses the generic S3 host name, the
     * part of the url up to and including that host name is replaced by the download host.
     */
    public String getDownloadUrl(S3ObjectSummary summary, S3RiverFeedDefinition feedDefinition){
        String resourceUrl = s3Client.getResourceUrl(summary.getBucketName(), summary.getKey());
        if (resourceUrl == null){
            return null;
        }
        // If a download host (actually a vhost such as cloudfront offers) is specified, use it to
        // recreate a vhosted resource url. This is made by substitution of the generic host name in url.
        String downloadHost = feedDefinition.getDownloadHost();
        if (downloadHost == null){
            return resourceUrl;
        }
        int hostPos = resourceUrl.indexOf(S3_GENERIC_HOST + "/");
        if (hostPos < 0){
            // The url does not use the generic host name: substituting at a computed
            // offset would corrupt it, so hand back the plain resource url instead.
            if (logger.isDebugEnabled()){
                logger.debug("No generic S3 host found in {}, download host is not applied", resourceUrl);
            }
            return resourceUrl;
        }
        return downloadHost + resourceUrl.substring(hostPos + S3_GENERIC_HOST.length());
    }
}